Linux Cubed Series 2: Applications

home *** CD-ROM | disk | FTP | other *** search

/ Linux Cubed Series 2: Applications / Linux Cubed Series 2 - Applications.iso / misc / ispell-3.001 / ispell-3~ / ispell-3.1 / buildhash.c next >

Wrap

C/C++ Source or Header | 1995-01-23 | 17KB | 686 lines

#ifndef lint static char Rcs_Id[] = "$Id: buildhash.c,v 1.64 1995/01/08 23:23:26 geoff Exp $"; #endif #define MAIN /* * buildhash.c - make a hash table for okspell * * Pace Willisson, 1983 * * Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * 4. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by Geoff Kuenning and * other unpaid contributors. * 5. The name of Geoff Kuenning may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Log: buildhash.c,v $ * Revision 1.64 1995/01/08 23:23:26 geoff * Make the various file suffixes configurable for DOS purposes. * * Revision 1.63 1994/10/26 05:12:25 geoff * Get rid of some duplicate declarations. * * Revision 1.62 1994/07/28 05:11:33 geoff * Log message for previous revision: distinguish a zero count from a bad * count file. * * Revision 1.61 1994/07/28 04:53:30 geoff * * Revision 1.60 1994/01/25 07:11:18 geoff * Get rid of all old RCS log lines in preparation for the 3.1 release. * */ #include "config.h" #include "ispell.h" #include "proto.h" #include "msgs.h" #include "version.h" #include <ctype.h> #include <sys/stat.h> int main P ((int argc, char * argv[])); static void output P ((void)); static void filltable P ((void)); VOID * mymalloc P ((unsigned int size)); VOID * myrealloc P ((VOID * ptr, unsigned int size, unsigned int oldsize)); void myfree P ((VOID * ptr)); static void readdict P ((void)); static void newcount P ((void)); #define NSTAT 100 /* Size probe-statistics table */ struct stat dstat; /* Result of stat-ing dict file */ struct stat cstat; /* Result of stat-ing count file */ char * Dfile; /* Name of dictionary file */ char * Hfile; /* Name of hash (output) file */ char * Lfile; /* Name of language file */ char Cfile[MAXPATHLEN]; /* Name of count file */ char Sfile[MAXPATHLEN]; /* Name of statistics file */ static int silent = 0; /* NZ to suppress count reports */ int main (argc, argv) int argc; char * argv[]; { int avg; FILE * countf; FILE * statf; int stats[NSTAT]; int i; int j; while (argc > 1 && *argv[1] == '-') { argc--; argv++; switch (argv[0][1]) { case 's': silent = 1; break; } } if (argc == 4) { Dfile = argv[1]; Lfile = argv[2]; Hfile = argv[3]; } else { (void) fprintf (stderr, BHASH_C_USAGE); return 1; } if (yyopen (Lfile)) /* Open the language file */ return 1; yyinit (); /* Set up for the parse */ if (yyparse ()) /* Parse the language tables */ exit (1); (void) sprintf (Cfile, "%s%s", Dfile, COUNTSUFFIX); (void) sprintf (Sfile, "%s%s", Dfile, STATSUFFIX); if (stat (Dfile, &dstat) < 0) { (void) fprintf (stderr, BHASH_C_NO_DICT, Dfile); exit (1); } if (stat (Cfile, &cstat) < 0 || dstat.st_mtime > cstat.st_mtime) newcount (); if ((countf = fopen (Cfile, "r")) == NULL) { (void) fprintf (stderr, BHASH_C_NO_COUNT); exit (1); } hashsize = 0; if (fscanf (countf, "%d", &hashsize) != 1 || fclose (countf) == EOF) { (void) fprintf (stderr, BHASH_C_BAD_COUNT); exit (1); } if (hashsize == 0) { (void) fprintf (stderr, BHASH_C_ZERO_COUNT); exit (1); } readdict (); if ((statf = fopen (Sfile, "w")) == NULL) { (void) fprintf (stderr, CANT_CREATE, Sfile); exit (1); } for (i = 0; i < NSTAT; i++) stats[i] = 0; for (i = 0; i < hashsize; i++) { struct dent * dp; dp = &hashtbl[i]; if ((dp->flagfield & USED) != 0) { for (j = 0; dp != NULL; j++, dp = dp->next) { if (j >= NSTAT) j = NSTAT - 1; stats[j]++; } } } for (i = 0, j = 0, avg = 0; i < NSTAT; i++) { j += stats[i]; avg += stats[i] * (i + 1); if (j == 0) (void) fprintf (statf, "%d:\t%d\t0\t0.0\n", i + 1, stats[i]); else (void) fprintf (statf, "%d:\t%d\t%d\t%f\n", i + 1, stats[i], j, (double) avg / j); } (void) fclose (statf); filltable (); output (); return 0; } static void output () { register FILE * houtfile; register struct dent * dp; int strptr; int n; int i; int maxplen; int maxslen; struct flagent * fentry; if ((houtfile = fopen (Hfile, "wb")) == NULL) { (void) fprintf (stderr, CANT_CREATE, Hfile); return; } hashheader.stringsize = 0; hashheader.lstringsize = 0; hashheader.tblsize = hashsize; (void) fwrite ((char *) &hashheader, sizeof hashheader, 1, houtfile); strptr = 0; /* ** Put out the strings from the flags table. This code assumes that ** the size of the hash header is a multiple of the size of ichar_t, ** and that any integer can be converted to an (ichar_t *) and back ** without damage. */ maxslen = 0; for (i = numsflags, fentry = sflaglist; --i >= 0; fentry++) { if (fentry->stripl) { (void) fwrite ((char *) fentry->strip, fentry->stripl + 1, sizeof (ichar_t), houtfile); fentry->strip = (ichar_t *) strptr; strptr += (fentry->stripl + 1) * sizeof (ichar_t); } if (fentry->affl) { (void) fwrite ((char *) fentry->affix, fentry->affl + 1, sizeof (ichar_t), houtfile); fentry->affix = (ichar_t *) strptr; strptr += (fentry->affl + 1) * sizeof (ichar_t); } n = fentry->affl - fentry->stripl; if (n < 0) n = -n; if (n > maxslen) maxslen = n; } maxplen = 0; for (i = numpflags, fentry = pflaglist; --i >= 0; fentry++) { if (fentry->stripl) { (void) fwrite ((char *) fentry->strip, fentry->stripl + 1, sizeof (ichar_t), houtfile); fentry->strip = (ichar_t *) strptr; strptr += (fentry->stripl + 1) * sizeof (ichar_t); } if (fentry->affl) { (void) fwrite ((char *) fentry->affix, fentry->affl + 1, sizeof (ichar_t), houtfile); fentry->affix = (ichar_t *) strptr; strptr += (fentry->affl + 1) * sizeof (ichar_t); } n = fentry->affl - fentry->stripl; if (n < 0) n = -n; if (n > maxplen) maxplen = n; } /* ** Write out the string character type tables. */ hashheader.strtypestart = strptr; for (i = 0; i < hashheader.nstrchartype; i++) { n = strlen (chartypes[i].name) + 1; (void) fwrite (chartypes[i].name, n, 1, houtfile); strptr += n; n = strlen (chartypes[i].deformatter) + 1; (void) fwrite (chartypes[i].deformatter, n, 1, houtfile); strptr += n; for (n = 0; chartypes[i].suffixes[n] != '\0'; n += strlen (&chartypes[i].suffixes[n]) + 1) ; n++; (void) fwrite (chartypes[i].suffixes, n, 1, houtfile); strptr += n; } hashheader.lstringsize = strptr; /* We allow one extra byte because missingletter() may add one byte */ maxslen += maxplen + 1; if (maxslen > MAXAFFIXLEN) { (void) fprintf (stderr, BHASH_C_BAFF_1 (MAXAFFIXLEN, maxslen - MAXAFFIXLEN)); (void) fprintf (stderr, BHASH_C_BAFF_2); } /* Put out the dictionary strings */ for (i = 0, dp = hashtbl; i < hashsize; i++, dp++) { if (dp->word == NULL) dp->word = (char *) -1; else { n = strlen (dp->word) + 1; (void) fwrite (dp->word, n, 1, houtfile); dp->word = (char *) strptr; strptr += n; } } /* Pad file to a struct dent boundary for efficiency. */ n = (strptr + sizeof hashheader) % sizeof (struct dent); if (n != 0) { n = sizeof (struct dent) - n; strptr += n; while (--n >= 0) (void) putc ('\0', houtfile); } /* Put out the hash table itself */ for (i = 0, dp = hashtbl; i < hashsize; i++, dp++) { if (dp->next != 0) { int x; x = dp->next - hashtbl; dp->next = (struct dent *)x; } else { dp->next = (struct dent *)-1; } #ifdef PIECEMEAL_HASH_WRITES (void) fwrite ((char *) dp, sizeof (struct dent), 1, houtfile); #endif /* PIECEMEAL_HASH_WRITES */ } #ifndef PIECEMEAL_HASH_WRITES (void) fwrite ((char *) hashtbl, sizeof (struct dent), hashsize, houtfile); #endif /* PIECEMEAL_HASH_WRITES */ /* Put out the language tables */ (void) fwrite ((char *) sflaglist, sizeof (struct flagent), numsflags, houtfile); hashheader.stblsize = numsflags; (void) fwrite ((char *) pflaglist, sizeof (struct flagent), numpflags, houtfile); hashheader.ptblsize = numpflags; /* Finish filling in the hash header. */ hashheader.stringsize = strptr; rewind (houtfile); (void) fwrite ((char *) &hashheader, sizeof hashheader, 1, houtfile); (void) fclose (houtfile); } static void filltable () { struct dent *freepointer, *nextword, *dp; struct dent *hashend; int i; int overflows; hashend = hashtbl + hashsize; for (freepointer = hashtbl; (freepointer->flagfield & USED) && freepointer < hashend; freepointer++) ; overflows = 0; for (nextword = hashtbl, i = hashsize; i != 0; nextword++, i--) { if ((nextword->flagfield & USED) == 0) continue; if (nextword->next >= hashtbl && nextword->next < hashend) continue; dp = nextword; while (dp->next) { if (freepointer >= hashend) { overflows++; break; } else { *freepointer = *(dp->next); dp->next = freepointer; dp = freepointer; while ((freepointer->flagfield & USED) && freepointer < hashend) freepointer++; } } } if (overflows) (void) fprintf (stderr, BHASH_C_OVERFLOW, overflows); } #if MALLOC_INCREMENT == 0 VOID * mymalloc (size) unsigned int size; { return malloc (size); } /* ARGSUSED */ VOID * myrealloc (ptr, size, oldsize) VOID * ptr; unsigned int size; unsigned int oldsize; { return realloc (ptr, size); } void myfree (ptr) VOID * ptr; { free (ptr); } #else VOID * mymalloc (size) /* Fast, unfree-able variant of malloc */ unsigned int size; { VOID * retval; static int bytesleft = 0; static VOID * nextspace; if (size < 4) size = 4; size = (size + 7) & ~7; /* Assume doubleword boundaries are enough */ if (bytesleft < size) { bytesleft = (size < MALLOC_INCREMENT) ? MALLOC_INCREMENT : size; nextspace = malloc ((unsigned) bytesleft); if (nextspace == NULL) { bytesleft = 0; return NULL; } } retval = nextspace; nextspace = (VOID *) ((char *) nextspace + size); bytesleft -= size; return retval; } VOID * myrealloc (ptr, size, oldsize) VOID * ptr; unsigned int size; unsigned int oldsize; { VOID *nptr; nptr = mymalloc (size); if (nptr == NULL) return NULL; (void) bcopy (ptr, nptr, oldsize); return nptr; } /* ARGSUSED */ void myfree (ptr) VOID * ptr; { } #endif static void readdict () { struct dent d; register struct dent * dp; struct dent * lastdp; char lbuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS]; char ucbuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS]; FILE * dictf; int i; int h; if ((dictf = fopen (Dfile, "r")) == NULL) { (void) fprintf (stderr, BHASH_C_CANT_OPEN_DICT); exit (1); } hashtbl = (struct dent *) calloc ((unsigned) hashsize, sizeof (struct dent)); if (hashtbl == NULL) { (void) fprintf (stderr, BHASH_C_NO_SPACE); exit (1); } i = 0; while (fgets (lbuf, sizeof lbuf, dictf) != NULL) { if (!silent && (i % 1000) == 0) { (void) fprintf (stderr, "%d ", i); (void) fflush (stdout); } i++; if (makedent (lbuf, sizeof lbuf, &d) < 0) continue; h = hash (strtosichar (d.word, 1), hashsize); dp = &hashtbl[h]; if ((dp->flagfield & USED) == 0) { *dp = d; #ifndef NO_CAPITALIZATION_SUPPORT /* ** If it's a followcase word, we need to make this a ** special dummy entry, and add a second with the ** correct capitalization. */ if (captype (d.flagfield) == FOLLOWCASE) { if (addvheader (dp)) exit (1); } #endif } else { /* ** Collision. Skip to the end of the collision ** chain, or to a pre-existing entry for this ** word. Note that d.word always exists at ** this point. */ (void) strcpy (ucbuf, d.word); chupcase (ucbuf); while (dp != NULL) { if (strcmp (dp->word, ucbuf) == 0) break; #ifndef NO_CAPITALIZATION_SUPPORT while (dp->flagfield & MOREVARIANTS) dp = dp->next; #endif /* NO_CAPITALIZATION_SUPPORT */ dp = dp->next; } if (dp != NULL) { /* ** A different capitalization is already in ** the dictionary. Combine capitalizations. */ if (combinecaps (dp, &d) < 0) exit (1); } else { /* Insert a new word into the dictionary */ for (dp = &hashtbl[h]; dp->next != NULL; ) dp = dp->next; lastdp = dp; dp = (struct dent *) mymalloc (sizeof (struct dent)); if (dp == NULL) { (void) fprintf (stderr, BHASH_C_COLLISION_SPACE); exit (1); } *dp = d; lastdp->next = dp; dp->next = NULL; #ifndef NO_CAPITALIZATION_SUPPORT /* ** If it's a followcase word, we need to make this a ** special dummy entry, and add a second with the ** correct capitalization. */ if (captype (d.flagfield) == FOLLOWCASE) { if (addvheader (dp)) exit (1); } #endif } } } if (!silent) (void) fprintf (stderr, "\n"); (void) fclose (dictf); } static void newcount () { char buf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS]; #ifndef NO_CAPITALIZATION_SUPPORT ichar_t ibuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS]; #endif register FILE * d; register int i; #ifndef NO_CAPITALIZATION_SUPPORT ichar_t lastibuf[sizeof ibuf / sizeof (ichar_t)]; int headercounted; int followcase; register char * cp; #endif if (!silent) (void) fprintf (stderr, BHASH_C_COUNTING); if ((d = fopen (Dfile, "r")) == NULL) { (void) fprintf (stderr, BHASH_C_CANT_OPEN_DICT); exit (1); } #ifndef NO_CAPITALIZATION_SUPPORT headercounted = 0; lastibuf[0] = 0; #endif for (i = 0; fgets (buf, sizeof buf, d); ) { if ((++i % 1000) == 0 && !silent) { (void) fprintf (stderr, "%d ", i); (void) fflush (stdout); } #ifndef NO_CAPITALIZATION_SUPPORT cp = index (buf, hashheader.flagmarker); if (cp != NULL) *cp = '\0'; if (strtoichar (ibuf, buf, INPUTWORDLEN * sizeof (ichar_t), 1)) (void) fprintf (stderr, WORD_TOO_LONG (buf)); followcase = (whatcap (ibuf) == FOLLOWCASE); upcase (ibuf); if (icharcmp (ibuf, lastibuf) != 0) headercounted = 0; else if (!headercounted) { /* First duplicate will take two entries */ if ((++i % 1000) == 0 && !silent) { (void) fprintf (stderr, "%d ", i); (void) fflush (stdout); } headercounted = 1; } if (!headercounted && followcase) { /* It's followcase and the first entry -- count again */ if ((++i % 1000) == 0 && !silent) { (void) fprintf (stderr, "%d ", i); (void) fflush (stdout); } headercounted = 1; } (void) icharcpy (lastibuf, ibuf); #endif } (void) fclose (d); if (!silent) (void) fprintf (stderr, BHASH_C_WORD_COUNT, i); if ((d = fopen (Cfile, "w")) == NULL) { (void) fprintf (stderr, CANT_CREATE, Cfile); exit (1); } (void) fprintf (d, "%d\n", i); (void) fclose (d); }